# Standard data-analysis stack; dataprep provides the one-call EDA report.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Explicit import instead of `import *`: only `plot` is used below, and a
# star import hides where names come from.
from dataprep.eda import plot

# Load the credit-card fraud dataset (columns: Time, V1..V28, Amount, Class).
df = pd.read_csv("creditcard.csv")
df.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# Dataset dimensions: (rows, columns)
df.shape
(284807, 31)
# dataprep EDA overview: per-column stats, distribution similarity, skew,
# negatives, duplicates (the tables below are its rendered output).
plot(df)
0%| | 0/3222 [00:00<?, ?it/s]
| Number of Variables | 31 |
|---|---|
| Number of Rows | 284807 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 1081 |
| Duplicate Rows (%) | 0.4% |
| Total Size in Memory | 67.4 MB |
| Average Row Size in Memory | 248.0 B |
| Variable Types | |
| V2 and V5 have similar distributions | Similar Distribution |
|---|---|
| V2 and V11 have similar distributions | Similar Distribution |
| V2 and V13 have similar distributions | Similar Distribution |
| V5 and V9 have similar distributions | Similar Distribution |
| V5 and V11 have similar distributions | Similar Distribution |
| V5 and V13 have similar distributions | Similar Distribution |
| V5 and V15 have similar distributions | Similar Distribution |
| V7 and V13 have similar distributions | Similar Distribution |
| V7 and V15 have similar distributions | Similar Distribution |
| V7 and V16 have similar distributions | Similar Distribution |
| V7 and V18 have similar distributions | Similar Distribution |
|---|---|
| V7 and V19 have similar distributions | Similar Distribution |
| V7 and V22 have similar distributions | Similar Distribution |
| V9 and V10 have similar distributions | Similar Distribution |
| V13 and V15 have similar distributions | Similar Distribution |
| V14 and V16 have similar distributions | Similar Distribution |
| V14 and V18 have similar distributions | Similar Distribution |
| V14 and V19 have similar distributions | Similar Distribution |
| V15 and V16 have similar distributions | Similar Distribution |
| V16 and V18 have similar distributions | Similar Distribution |
| V16 and V19 have similar distributions | Similar Distribution |
|---|---|
| V18 and V19 have similar distributions | Similar Distribution |
| V18 and V22 have similar distributions | Similar Distribution |
| V19 and V22 have similar distributions | Similar Distribution |
| V20 and V21 have similar distributions | Similar Distribution |
| V1 is skewed | Skewed |
| V2 is skewed | Skewed |
| V3 is skewed | Skewed |
| V4 is skewed | Skewed |
| V5 is skewed | Skewed |
| V6 is skewed | Skewed |
|---|---|
| V7 is skewed | Skewed |
| V8 is skewed | Skewed |
| V9 is skewed | Skewed |
| V10 is skewed | Skewed |
| V12 is skewed | Skewed |
| V14 is skewed | Skewed |
| V16 is skewed | Skewed |
| V17 is skewed | Skewed |
| V18 is skewed | Skewed |
| V19 is skewed | Skewed |
|---|---|
| V20 is skewed | Skewed |
| V21 is skewed | Skewed |
| V22 is skewed | Skewed |
| V23 is skewed | Skewed |
| V24 is skewed | Skewed |
| V25 is skewed | Skewed |
| V27 is skewed | Skewed |
| V28 is skewed | Skewed |
| Amount is skewed | Skewed |
| Class has constant length 1 | Constant Length |
|---|---|
| V1 has 141456 (49.67%) negatives | Negatives |
| V2 has 134218 (47.13%) negatives | Negatives |
| V3 has 128163 (45.0%) negatives | Negatives |
| V4 has 144105 (50.6%) negatives | Negatives |
| V5 has 148928 (52.29%) negatives | Negatives |
| V6 has 176633 (62.02%) negatives | Negatives |
| V7 has 135852 (47.7%) negatives | Negatives |
| V8 has 135521 (47.58%) negatives | Negatives |
| V9 has 149455 (52.48%) negatives | Negatives |
| V10 has 159922 (56.15%) negatives | Negatives |
|---|---|
| V11 has 145632 (51.13%) negatives | Negatives |
| V12 has 121089 (42.52%) negatives | Negatives |
| V13 has 144123 (50.6%) negatives | Negatives |
| V14 has 133762 (46.97%) negatives | Negatives |
| V15 has 136234 (47.83%) negatives | Negatives |
| V16 has 132552 (46.54%) negatives | Negatives |
| V17 has 153635 (53.94%) negatives | Negatives |
| V18 has 142942 (50.19%) negatives | Negatives |
| V19 has 141804 (49.79%) negatives | Negatives |
| V20 has 169130 (59.38%) negatives | Negatives |
|---|---|
| V21 has 153196 (53.79%) negatives | Negatives |
| V22 has 141402 (49.65%) negatives | Negatives |
| V23 has 148400 (52.11%) negatives | Negatives |
| V24 has 127732 (44.85%) negatives | Negatives |
| V25 has 139302 (48.91%) negatives | Negatives |
| V26 has 151720 (53.27%) negatives | Negatives |
| V27 has 141004 (49.51%) negatives | Negatives |
| V28 has 125888 (44.2%) negatives | Negatives |
Number of plots per page:
# Column dtypes, non-null counts, and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null int64 dtypes: float64(30), int64(1) memory usage: 67.4 MB
# Count the missing values in each column; the dataset is complete, so
# every count is expected to be zero.
missing_per_column = df.isna().sum()
print(missing_per_column)
Time 0 V1 0 V2 0 V3 0 V4 0 V5 0 V6 0 V7 0 V8 0 V9 0 V10 0 V11 0 V12 0 V13 0 V14 0 V15 0 V16 0 V17 0 V18 0 V19 0 V20 0 V21 0 V22 0 V23 0 V24 0 V25 0 V26 0 V27 0 V28 0 Amount 0 Class 0 dtype: int64
# Report how many rows are exact duplicates of an earlier row.
df.duplicated(keep='first').sum()
1081
# Drop exact duplicate rows, keeping the first occurrence of each
# (keep='first' is the pandas default).
df = df.drop_duplicates()
# Verify that no duplicates remain.
df.duplicated().sum()
0
# Column-wise extrema of every feature (axis=0 is the default).
df_minimum = df.min()
df_maximum = df.max()
df_minimum
Time 0.000000 V1 -56.407510 V2 -72.715728 V3 -48.325589 V4 -5.683171 V5 -113.743307 V6 -26.160506 V7 -43.557242 V8 -73.216718 V9 -13.434066 V10 -24.588262 V11 -4.797473 V12 -18.683715 V13 -5.791881 V14 -19.214325 V15 -4.498945 V16 -14.129855 V17 -25.162799 V18 -9.498746 V19 -7.213527 V20 -54.497720 V21 -34.830382 V22 -10.933144 V23 -44.807735 V24 -2.836627 V25 -10.295397 V26 -2.604551 V27 -22.565679 V28 -15.430084 Amount 0.000000 Class 0.000000 dtype: float64
# Start a min/max summary table from the per-column minima; to_frame's
# `name` argument labels the column directly, replacing the separate
# rename step.
df_minmax = df_minimum.to_frame(name='Minimum')
df_minmax
| Minimum | |
|---|---|
| Time | 0.000000 |
| V1 | -56.407510 |
| V2 | -72.715728 |
| V3 | -48.325589 |
| V4 | -5.683171 |
| V5 | -113.743307 |
| V6 | -26.160506 |
| V7 | -43.557242 |
| V8 | -73.216718 |
| V9 | -13.434066 |
| V10 | -24.588262 |
| V11 | -4.797473 |
| V12 | -18.683715 |
| V13 | -5.791881 |
| V14 | -19.214325 |
| V15 | -4.498945 |
| V16 | -14.129855 |
| V17 | -25.162799 |
| V18 | -9.498746 |
| V19 | -7.213527 |
| V20 | -54.497720 |
| V21 | -34.830382 |
| V22 | -10.933144 |
| V23 | -44.807735 |
| V24 | -2.836627 |
| V25 | -10.295397 |
| V26 | -2.604551 |
| V27 | -22.565679 |
| V28 | -15.430084 |
| Amount | 0.000000 |
| Class | 0.000000 |
# Attach the per-column maxima alongside the minima (aligned on the index).
df_minmax = df_minmax.assign(Maximum=df_maximum)
df_minmax
| Minimum | Maximum | |
|---|---|---|
| Time | 0.000000 | 172792.000000 |
| V1 | -56.407510 | 2.454930 |
| V2 | -72.715728 | 22.057729 |
| V3 | -48.325589 | 9.382558 |
| V4 | -5.683171 | 16.875344 |
| V5 | -113.743307 | 34.801666 |
| V6 | -26.160506 | 73.301626 |
| V7 | -43.557242 | 120.589494 |
| V8 | -73.216718 | 20.007208 |
| V9 | -13.434066 | 15.594995 |
| V10 | -24.588262 | 23.745136 |
| V11 | -4.797473 | 12.018913 |
| V12 | -18.683715 | 7.848392 |
| V13 | -5.791881 | 7.126883 |
| V14 | -19.214325 | 10.526766 |
| V15 | -4.498945 | 8.877742 |
| V16 | -14.129855 | 17.315112 |
| V17 | -25.162799 | 9.253526 |
| V18 | -9.498746 | 5.041069 |
| V19 | -7.213527 | 5.591971 |
| V20 | -54.497720 | 39.420904 |
| V21 | -34.830382 | 27.202839 |
| V22 | -10.933144 | 10.503090 |
| V23 | -44.807735 | 22.528412 |
| V24 | -2.836627 | 4.584549 |
| V25 | -10.295397 | 7.519589 |
| V26 | -2.604551 | 3.517346 |
| V27 | -22.565679 | 31.612198 |
| V28 | -15.430084 | 33.847808 |
| Amount | 0.000000 | 25691.160000 |
| Class | 0.000000 | 1.000000 |
# Keep only the PCA components V1-V28 and the label: remove Time and Amount.
df = df.drop(columns=["Time", "Amount"])
from sklearn.preprocessing import StandardScaler
# Feature matrix: every column except the target. Selecting the target by
# name instead of position (`iloc[:, :-1]`) keeps this correct even if the
# column order ever changes.
X = df.drop(columns=['Class']).values
# Target vector: fraud label (0 = legitimate, 1 = fraud).
Y = df['Class'].values
X
array([[-1.35980713e+00, -7.27811733e-02, 2.53634674e+00, ...,
-1.89114844e-01, 1.33558377e-01, -2.10530535e-02],
[ 1.19185711e+00, 2.66150712e-01, 1.66480113e-01, ...,
1.25894532e-01, -8.98309914e-03, 1.47241692e-02],
[-1.35835406e+00, -1.34016307e+00, 1.77320934e+00, ...,
-1.39096572e-01, -5.53527940e-02, -5.97518406e-02],
...,
[ 1.91956501e+00, -3.01253846e-01, -3.24963981e+00, ...,
-8.73705959e-02, 4.45477214e-03, -2.65608286e-02],
[-2.40440050e-01, 5.30482513e-01, 7.02510230e-01, ...,
5.46668462e-01, 1.08820735e-01, 1.04532821e-01],
[-5.33412522e-01, -1.89733337e-01, 7.03337367e-01, ...,
-8.18267121e-01, -2.41530880e-03, 1.36489143e-02]])
# Preview of the target array (mostly zeros — fraud is the rare class).
Y
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
scale = scaler.fit_transform(X)
scale
array([[-0.70108232, -0.04168726, 1.68010106, ..., -0.39262206,
0.33303251, -0.06584955],
[ 0.60879165, 0.16413764, 0.10927902, ..., 0.26085374,
-0.0271543 , 0.0432187 ],
[-0.7003364 , -0.81133678, 1.17426962, ..., -0.28886092,
-0.14432548, -0.18382429],
...,
[ 0.98235398, -0.18043304, -2.15503257, ..., -0.18155722,
0.00680174, -0.08264021],
[-0.12646526, 0.32465977, 0.46457662, ..., 1.13373436,
0.27052318, 0.31700384],
[-0.27686005, -0.1127094 , 0.46512487, ..., -1.69777619,
-0.01055821, 0.03994074]])
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing, with a fixed seed for
# reproducibility. NOTE(review): split the standardized matrix `scale` —
# it was computed above, but the original code split the raw X, leaving
# the StandardScaler output completely unused.
x_train, x_test, y_train, y_test = train_test_split(scale, Y, test_size=0.2, random_state=100)
from sklearn.tree import DecisionTreeClassifier

# Create the decision-tree classifier. Fix random_state so runs are
# reproducible: ties between equally-good splits are otherwise broken at
# random, giving a different tree (and accuracy) on every run.
clf = DecisionTreeClassifier(random_state=100)
# Train the classifier on the training split.
clf = clf.fit(x_train, y_train)
# Predict the response for the held-out test set.
y_pred = clf.predict(x_test)

from sklearn import metrics  # for accuracy calculation
# NOTE(review): the classes are extremely imbalanced (~0.17% fraud), so
# plain accuracy is misleading — a model predicting "not fraud" for every
# row already scores ~99.8%. Precision/recall on the fraud class or a
# confusion matrix would be a more informative evaluation.
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred))
Accuracy : 0.9988721672012124